############       Enumerative Reading     ################
## This code accompanies the article:
#Andrew Piper, "Enumerative," Further Reading, ed. Leah Price and Matthew Rubery (Oxford UP, 2020)

## Assess difficulty using Flesh reading ease and Tuldava's readability scores
#the calculation of reading ease was done using the koRpus package in R

#### 19C ####

#difficulty scores
a<-read.csv("Difficulty_Stanford.csv")
b<-read.csv("Difficulty_Stanford2.csv")
meta<-read.csv("Stanford.csv")

#reduce first table to 20 samples
diff.df<-NULL
for (i in 1:nlevels(a$work)){
  sub<-a[a$work == levels(a$work)[i],]
  sub<-sub[sample(nrow(sub), 20),]
  diff.df<-rbind(diff.df, sub)
}

#remove tuldava
diff.df<-diff.df[,-3]

#combine
c<-rbind(diff.df, b)
c<-c[,-1]

#measure decade-level difficulty levels

#create decade
meta$decade<-as.factor(paste(substr(meta$pubdate, 1, 3), "0", sep=""))
#subset table by known dates
d<-c[as.character(c$work) %in% as.character(meta$filename),]
#attach decades to difficulty table
final.df<-NULL
for (i in 1:nlevels(as.factor(d$work))){
  sub<-d[as.character(d$work) == levels(as.factor(as.character(d$work)))[i],]
  sub$decade<-meta[as.character(meta$filename) == as.character(sub$work)[1],8]
  final.df<-rbind(final.df, sub)
}

#plot
boxplot(final.df$flesch.v ~ final.df$decade, outline=F)

#run bootstrap

boot.mean<-function(data, num){
  resamples<-lapply(1:num, function(i) sample(data, replace=T))
  r.mean<-sapply(resamples, mean)
  return(r.mean)
}

boot.df<-NULL
for (i in 2:nlevels(factor(final.df$decade))){
  sub<-final.df[final.df$decade == levels(factor(final.df$decade))[i],]
  boot.sub<-boot.mean(sub$flesch.v, 1000)
  decade<-levels(factor(final.df$decade))[i]
  temp.df<-data.frame(decade, boot.sub)
  boot.df<-rbind(boot.df, temp.df)
}

boxplot(boot.df$boot.sub ~ boot.df$decade)

### 20C ###

klab<-read.csv("Difficulty_KLAB.csv")
klab.m<-read.csv("KLAB_RANDOM_META.csv")
#create decade
klab.m$decade<-as.factor(paste(substr(klab.m$PUBL_DATE, 1, 3), "0", sep=""))
#subset table by known dates
#k2<-klab[as.character(c$work) %in% as.character(meta$filename),]
#attach decades to difficulty table
klab.df<-NULL
for (i in 1:nlevels(as.factor(klab$work))){
  sub<-klab[as.character(klab$work) == levels(as.factor(as.character(klab$work)))[i],]
  sub$decade<-klab.m[as.character(klab.m$FILENAME) == as.character(sub$work)[1],17]
  klab.df<-rbind(klab.df, sub)
}

#plot
boxplot(klab.df$flesch.v ~ klab.df$decade, outline=F)

#run bootstrap

boot.mean<-function(data, num){
  resamples<-lapply(1:num, function(i) sample(data, replace=T))
  r.mean<-sapply(resamples, mean)
  return(r.mean)
}

boot.df2<-NULL
for (i in 3:(nlevels(factor(klab.df$decade))-1)){
  sub<-klab.df[klab.df$decade == levels(factor(klab.df$decade))[i],]
  boot.sub<-boot.mean(sub$flesch.v, 1000)
  decade<-levels(factor(klab.df$decade))[i]
  temp.df<-data.frame(decade, boot.sub)
  boot.df2<-rbind(boot.df2, temp.df)
}

boxplot(boot.df2$boot.sub ~ boot.df2$decade)



######### Collocates of Reading ###########
#This script extracts collocates of the root "read" using +/- 5 word windows

#this script takes 33-year intervals and calculates the most common words
#collocated with "read*"; it adjusts raw counts by pointwise mutual information

#load libraries

library(tm)
library(SnowballC)
library(slam)

#set your working directory
wd<-c("~/Data")

#set your folder where you files are stored
texts<-c("19C_Stanford")

#get filenames from directory of texts you want to examine
setwd(wd)
filenames<-list.files(texts, pattern="*.txt", full.names=FALSE)

#ingest metadata
a<-read.csv("Stanford.csv")

#input a keyword you wish to analyze
dict<-c("read", "reader", "reads", "reading", "reader")

#establish 33-year segments
seg1<-a[a$pubdate < 1834,]
seg2<-a[a$pubdate > 1833 & a$pubdate < 1867,]
seg3<-a[a$pubdate > 1866,]

#establish base word frequencies

#read in corpus1
corpus1 <- VCorpus(DirSource(texts, encoding = "UTF-8"), readerControl=list(language="English"))

#clean data
corpus1 <- tm_map(corpus1, content_transformer(stripWhitespace))
corpus1 <- tm_map(corpus1, content_transformer(tolower))
corpus1 <- tm_map(corpus1, content_transformer(removeNumbers))
#corpus1 <- tm_map(corpus1, removeWords, stopwords("English"))
corpus1 <- tm_map(corpus1, content_transformer(removePunctuation))
corpus1 <- tm_map(corpus1, stemDocument, language = "english")

#create document term matrix
corpus1.dtm<-DocumentTermMatrix(corpus1, control=list(wordLengths=c(1,Inf)))

#perform transformations on the raw counts
scaling1<-row_sums(corpus1.dtm) #get total word counts for each work

#subset filenames by segment
filenames1<-filenames[as.character(filenames) %in% as.character(seg1$filename)]
#filenames1<-filenames[as.character(filenames) %in% as.character(seg2$filename)]
#filenames1<-filenames[as.character(filenames) %in% as.character(seg3$filename)]

#subset DTM by segments
dtm1<-corpus1.dtm[row.names(corpus1.dtm) %in% as.character(seg1$filename),]
#dtm1<-corpus1.dtm[row.names(corpus1.dtm) %in% as.character(seg2$filename),]
#dtm1<-corpus1.dtm[row.names(corpus1.dtm) %in% as.character(seg3$filename),]

#now extract collocates from every file in each segment

#set your working directory to where your files are stored
setwd(paste(wd, texts, sep="/"))

#set the length of your window of words for collocates (the number entered is +/- so 2x the number)
#shorter windows capture more specific collocates (+/- 1 for example captures only words directly next to a keyword)
window<-5 #5 means that you will have a window of +/- 5 words overall
total.gram<-vector()
for (i in 1:length(filenames1)) {
  print(i)
  #scan in work and clean
  work<-scan(filenames1[i], what="character", quote="", quiet = T)
  #clean numbers
  work.clean<-gsub("\\d", "", work)
  #clean puncutation
  work.clean<-gsub("[[:punct:]]","", work.clean)
  #tolower
  work.clean<-tolower(work.clean)
  #remove a word like "page"
  work.clean<-gsub("page", "", work.clean)
  work.clean<-work.clean[work.clean != ""]
  #remove stopwords
  remove.w<-stopwords("en")
  remove.w<-gsub("[[:punct:]]","", remove.w)
  word.count<-length(work.clean)
  work.clean<-subset(work.clean, !(work.clean %in% remove.w)) #optional: removes stopwords
  #stem
  work.clean<-wordStem(work.clean, language="en")
  #capture exact match
  work.ont<-which(work.clean %in% dict) # position of keyword
  work.collocates<-NULL
  for (j in 1:length(work.ont)){
    if (length(work.ont) > 0){
      #establish window
      after<-work.ont[j]+window
      before<-work.ont[j]-window
      #make sure not too close to end / beginning
      if (before <= 0) {before <- (window-1)-(abs(before))}
      if (after > length(work.clean)) {after <- after-(after-length(work.clean))}
      shift<-work.ont[j]+1
      #capture words before and after keyword
      before.gram<-work.clean[before:(work.ont[j]-1)] # finds next X positions after is
      after.gram<-work.clean[shift:after] # finds previous X positions after is
      #append to a vector
      combined.gram<-append(before.gram, after.gram)
      #append to total vector
      total.gram<-append(total.gram, combined.gram)
    }
  }
}

#the output total.gram = a list of words associated with your keyword

#rank your collocates by frequency
collocate.df<-data.frame(sort(table(total.gram), decreasing = T))

#transform by PMI value (pointwise mutual information)
#PMI = log (P(x,y)/P(x)*P(y)) OR P(y|x)/P(y) where y = collocate and x = keyword
#P(y|x) = collocate frequency / keyword frequency
#P(y) = frequency of y

#first remove keyword(s) from collocates
collocate.df2<-collocate.df[!collocate.df$total.gram %in% dict,]

#then remove words under N occurrences
collocate.df2<-collocate.df2[collocate.df2$Freq > 24,]

#then go through each row and transform by PMI
pmi<-vector()
for (i in 1:nrow(collocate.df2)){
  print(i)
  #probability of keyword(s)
  px<-sum(dtm1[,colnames(dtm1) %in% dict])/sum(scaling1)
  #probability of collocate
  py<-sum(dtm1[,colnames(dtm1) == as.character(collocate.df2$total.gram[i])])/sum(scaling1)
  #probability of keyword-collocate pair
  #pxy<-collocate.df2[i,2]/sum(corpus1.matrix)
  #conditional probability of P(y|x)
  py.x<-(collocate.df2[i,2]/sum(scaling1))/px
  #PMI
  pmi.score<-log(py.x/py)
  pmi<-append(pmi, pmi.score)
}

#append pmi vector to collocate table
collocate.df2$pmi<-pmi
collocate.df3<-collocate.df2[order(-collocate.df2$pmi),]

#write to disk
setwd(wd)
write.csv(collocate.df3, file="Reading_1800_1833.csv", row.names = F)

####### Visualize #######
library(wordcloud)

#get total word counts
a<-read.csv("Reading_WordCount.csv")
filenames<-list.files("Reading_Tables")
setwd("~/Documents/1. Articles/Enumerative - Oxford UP 2018/Reading_Tables")

#model 1 = most frequent collocates

for (i in 1:length(filenames)){
  sub<-read.csv(filenames[i])
  sub<-sub[sub$Freq > a$per.million.adj[i],]
  sub<-sub[1:20,]
  sub<-sub[order(-sub$Freq),]
  sub2<-sub[1:5,]
  wordcloud(as.character(sub2$total.gram), sub2$Freq, min.freq = 0, random.order = F, rot.per = 0, family = "serif", font=1)
}

#comparison by period
final.df<-NULL
for (i in 1:(length(filenames)-1)){
  sub1<-read.csv(filenames[i])
  sub1<-sub1[sub1$Freq > 70,]
  sub2<-read.csv(filenames[i+1])
  sub2<-sub2[sub2$Freq > 70,]
  #keep words in top 20 of next period that are NOT in previous period
  sub3<-sub2[1:20,][!as.character(sub2$total.gram)[1:20] %in% as.character(sub1$total.gram)[1:20],]
  sub3$period<-filenames[i+1]
  final.df<-rbind(final.df, sub3)
}



